#### PREAMBLE ####

library(ggplot2)
library(geosphere)
library(dplyr)
library(data.table)
library(tidyr)
library(zoo)
library(sf)
library(ggspatial)
library(RColorBrewer)
library(estimatr)
library(broom)
library(texreg)
library(did)


#### TEMPLATE ####
goldenScatterCAtheme <- theme(
  panel.background = element_rect(fill = "white"),
  aspect.ratio = ((1 + sqrt(5))/2)^(-1),
  axis.ticks.length = unit(0.5, "char"),
  axis.line.x.top = element_line(size = 0.2),
  axis.line.x.bottom = element_line(size = 0.2),
  axis.ticks.x = element_line(size = 0.2),
  axis.text.x = element_text(color = "black", size = 10),
  axis.title.x = element_text(size = 10, 
                              margin = margin(t = 7.5, r = 0, b = 0, l = 0)),
  axis.ticks.y = element_blank(),
  axis.text.y = element_text(color = "black", size = 10,
                             margin = margin(t = 0, r = -4, b = 0, l = 0)),
  axis.title.y = element_text(size = 10,
                              margin = margin(t = 0, r = 7.5, b = 0, l = 0)),
  #legend.key = element_rect(fill = NA, color = NA),
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_line(color = "gray45", size = 0.2),
  strip.background = element_blank(),
  strip.text.x = element_text(size = 8),
  strip.text.y = element_blank(),
  strip.placement = "outside",
  panel.spacing.x = unit(1.25, "lines"),
  panel.spacing.y = unit(1, "lines")
)
class(goldenScatterCAtheme)
#### LOAD DATA ####

set.seed(20230711)

#load main data
df <- read.csv("Outputs/Analysis all muns.csv", stringsAsFactors = F, na.strings = c("", "NA"))
df$INEGI <- sprintf("%05d", df$INEGI)
df$INEGI <- as.character(df$INEGI)
df$time <- as.Date(df$time)


#### TRANSFORMATION ####

#states treated by US trade restrictions
unique(df$State[df$ustrade==1])
df$ustrade[df$time>=as.Date("2016-06-01") & df$trade==1 & (df$State=="MICHOACAN")] <- 1
#updating ustrade.
df$ustrade <- df$ustrade * df$trade


df$mh <- df$malicious_homicides
df$e <- df$extortion

#### LOAD AND JOIN PUBLIC EXPENDITURE DATA ####

# Load necessary library
library(data.table) # Using data.table for faster file reading

# Define the path to the folder containing the CSV files
folder_path <- "Source data/Public expenditure/conjunto_de_datos"

# Define a pattern to match files ending with the years 2011 to 2019
year_pattern <- paste0("(", paste(2011:2019, collapse = "|"), ")\\.csv$")

# Get a list of all CSV files in the folder that match the year pattern
csv_files <- list.files(path = folder_path, pattern = year_pattern, full.names = TRUE)

# Load all matching CSV files into a list
csv_list <- lapply(csv_files, fread) # fread is used from data.table package for faster reading

combined_data <- rbindlist(csv_list)
gc()

spend <- combined_data[combined_data$TEMA=="Egresos" & combined_data$DESCRIPCION_CATEGORIA=="Total de egresos",]
rm(combined_data, csv_list)
gc()

spend$ID_ENTIDAD <- sprintf("%02d", spend$ID_ENTIDAD)
spend$ID_MUNICIPIO <- sprintf("%03d", spend$ID_MUNICIPIO)
spend$INEGI <- paste0(spend$ID_ENTIDAD, spend$ID_MUNICIPIO)
spend <- spend[,c("INEGI", "ANIO", "VALOR")]
gc()
colnames(spend)[3] <- "Total_spending"

spend <- spend[spend$INEGI %in% df$INEGI,]

table(spend$INEGI, spend$ANIO)

#linear interpolation for missing data

# Load necessary libraries
library(dplyr)
library(tidyr)
library(zoo)

# Create a complete dataset with all INEGI and ANIO combinations
complete_spend <- expand.grid(
  INEGI = unique(spend$INEGI),
  ANIO = seq(min(spend$ANIO), max(spend$ANIO), by = 1)
)

# Join the complete dataset with the original dataset
spend_complete <- complete_spend %>%
  left_join(spend, by = c("INEGI", "ANIO"))

# Function to interpolate missing values
interpolate_missing <- function(df) {
  df %>%
    group_by(INEGI) %>%
    arrange(ANIO) %>%
    mutate(Total_spending = na.approx(Total_spending, na.rm = FALSE, rule = 2)) %>%
    ungroup()
}

# Apply the interpolation function
spend_imputed <- interpolate_missing(spend_complete)

spend <- spend_imputed
rm(spend_imputed, spend_complete, complete_spend)
gc()

df <- left_join(df, spend, by=c("Year"="ANIO", "INEGI"="INEGI"))


#### LOAD MISSING PERSONS DATA ####

mis <- read.csv("Source data/mis.csv", stringsAsFactors = F)

#combine state and municipality
#do same in df
df$state_mun <- paste(df$State, df$Municipality, sep="; ")
df_state_muns <- unique(df$state_mun)

#convert time to Date class
mis <- mis[mis$time!="NO ESPECIFICADO",]
mis <- mis[mis$time!="NO DISPONIBLE",]

mis$time <- as.Date(mis$time, format="%m/%d/%Y")

#subset df to only municipalities in mis
df_mis <- df[df$state_mun %in% mis$state_mun,]

#convert time to year month for both df_mis and mis
df_mis$time_mo <- format(df_mis$time, "%Y-%m")
mis$time_mo <- format(mis$time, "%Y-%m")

#sum missing persons in df
# Compute the counts
counts <- mis %>%
  group_by(state_mun, time_mo) %>%
  summarise(count = n())
# Merge the counts back to df_mis
df_mis <- df_mis %>%
  left_join(counts, by = c("state_mun", "time_mo"))
df_mis$mis_per <- NA_real_
df_mis$mis_per <- df_mis$count
df_mis$mis_per[is.na(df_mis$mis_per)] <- 0

#### TREATMENT TIMING HISTOGRAMS ####

### MEXICAN CERTIFICATION ###

treat_times <- read.csv("Source data/municipality_treatment_times.csv", stringsAsFactors = F, na.strings = "")

# Load necessary libraries
library(ggplot2)
library(lubridate)
library(dplyr)

# Convert the character dates to Date type
treat_times$document.date <- as.Date(treat_times$document.date, format="%Y-%m-%d")

# Extract the year from the date
treat_times$year <- year(treat_times$document.date)

# Filter out the year 2020 and later
treat_times_filtered <- treat_times %>% filter(year <= 2019)

nums <- 12

# Create a full sequence of years from 2001 to 2019
full_years <- data.frame(year = 2001:2019)

# Count the occurrences of each year in your data
year_counts <- as.data.frame(table(treat_times_filtered$year))
names(year_counts) <- c("year", "count")

# Merge the full sequence of years with the year counts, filling in zeros for missing years
complete_data <- merge(full_years, year_counts, by = "year", all.x = TRUE)
complete_data[is.na(complete_data)] <- 0  # Replace NA with 0

treat_time_hist <- ggplot(complete_data, aes(x = year, y = count)) +
  geom_bar(stat = "identity", fill = "grey83") +
  scale_x_continuous(breaks = 2001:2019) +
  scale_y_continuous(limits = c(0,15))+
  geom_vline(xintercept=2010.5, linetype=2) +
  labs(x="Year",
       y="Municipalities Earning Certification From the \nMexican Government Each Year") +
  theme_classic(base_size=15) +
  theme(axis.text = element_text(size = nums))
treat_time_hist

h <- 6
w <- 1.618*h
ggsave("Outputs/plots/Fig10_treatment_timing_histogram_mexican_cert.pdf", width=w, height=h)




### US TRADE ###




# Convert the character dates to Date type
treat_times$est_us_date <- as.Date(treat_times$est_us_date, format="%Y-%m-%d")

#set est_us_date to max of document.date and est_us_date
for(i in 1:nrow(treat_times)){
  treat_times$est_us_date[i] <- max(treat_times$document.date[i], treat_times$est_us_date[i], na.rm=T)
}

# Extract the year from the date
treat_times$year <- year(treat_times$est_us_date)

# Filter out the year 2020 and later
treat_times_filtered <- treat_times %>% filter(year <= 2019)

nums <- 12

# Create a full sequence of years from 2001 to 2019
full_years <- data.frame(year = 2001:2019)

# Count the occurrences of each year in your data
year_counts <- as.data.frame(table(treat_times_filtered$year))
names(year_counts) <- c("year", "count")

# Merge the full sequence of years with the year counts, filling in zeros for missing years
complete_data <- merge(full_years, year_counts, by = "year", all.x = TRUE)
complete_data[is.na(complete_data)] <- 0  # Replace NA with 0

treat_time_hist_us <- ggplot(complete_data, aes(x = year, y = count)) +
  geom_bar(stat = "identity", fill = "grey83") +
  scale_x_continuous(breaks = 2001:2019) +
  scale_y_continuous(limits = c(0,15))+
  geom_vline(xintercept=2010.5, linetype=2) +
  labs(x="Year",
       y="Municipalities Gaining Trade With the \nU.S. Each Year") +
  theme_classic(base_size=15) +
  theme(axis.text = element_text(size = nums))
treat_time_hist_us

h <- 6
w <- 1.618*h
ggsave("Outputs/plots/Fig11_treatment_timing_histogram_us_trade.pdf", width=w, height=h)



#

#### DIDs WITH WHOLE COUNTRY ####

df$Total_spending <- df$Total_spending/1000000

#there are three municipalities which drop from the data starting in 2012. remove them
t <- as.data.frame(table(df$time, df$INEGI))
t$Var1 <- as.character(t$Var1)
t$Var2 <- as.character(t$Var2)
temp1 <- t$Var2[(t$Var1=="2011-12-01" & t$Freq==1)]
temp2 <- t$Var2[(t$Var1=="2012-01-01" & t$Freq==0)]
temp <- temp1[temp1 %in% temp2]
df <- df[!df$INEGI %in% temp,]
df_mis <- df_mis[!df_mis$INEGI %in% temp,]

### INTENTIONAL HOMICIDES ###

lm1mh <- lm_robust(mh ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
summary(lm1mh)


lm1h <- lm_robust(homicides ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                  fixed_effects = ~factor(time),
                  se_type="stata")
summary(lm1h)


lm1gh <- lm_robust(gun_homicides ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
summary(lm1gh)


lm1mp <- lm_robust(mis_per ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df_mis, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
summary(lm1mp)



texreg(l=list(lm1mh, lm1h, lm1gh, lm1mp), include.ci=F, include.rmse=F, omit.coef = "factor", file="Outputs/TA9_wholecountryregs.txt")


#do robberies and extortion
colnames(df)
lm1hr <- lm_robust(home_robbery ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
lm1br <- lm_robust(business_robbery ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
lm1tr <- lm_robust(transport_robbery ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
lm1pr <- lm_robust(personal_robbery ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")
lm1e <- lm_robust(extortion ~ trade + ustrade + Total_spending + time*factor(INEGI), data=df, clusters=INEGI,
                   fixed_effects = ~factor(time),
                   se_type="stata")

summary(lm1hr)
summary(lm1br)
summary(lm1tr)
summary(lm1pr)
summary(lm1e)

texreg(l=list(lm1hr, lm1br, lm1tr, lm1pr, lm1e), include.ci=F, include.rmse=F, omit.coef = "factor", file="Outputs/TA10_wholecountryregs_nonviolent.txt")


#